library(socratadata)
library(dplyr)
library(lubridate)
library(janitor)
library(readr)
library(ggplot2)
library(ggalluvial)NYC Building Violation Analysis EDAV Project
housing_filtered <- read.csv("/Users/shreyashetty/Documents/Fall 2025 Courses/EDAV/Final_Project/Housing_Maintenance_Code_Violations_20251125.csv")
cat(nrow(housing_filtered))143112
sr311_filtered <- read.csv("/Users/shreyashetty/Documents/Fall 2025 Courses/EDAV/Final_Project/311_Service_Requests_from_2010_to_Present_20251125.csv")
cat(nrow(sr311_filtered))701761
# view(housing_filtered)
# view(sr311_filtered)
head(housing_filtered) ViolationID BuildingID RegistrationID BoroID Borough HouseNumber
1 17442503 11076 100711 1 MANHATTAN 142
2 17576658 27937 108549 1 MANHATTAN 121
3 17964149 19366 107245 1 MANHATTAN 171
4 17601375 42010 105105 1 MANHATTAN 202
5 17601376 42010 105105 1 MANHATTAN 202
6 17390227 42340 125158 1 MANHATTAN 450
LowHouseNumber HighHouseNumber StreetName StreetCode Postcode
1 142 148 EAST 7 STREET 17130 10009
2 121 121 ST NICHOLAS AVENUE 31190 10026
3 171 171 EAST 111 STREET 19190 10029
4 202 216 WEST 146 STREET 36690 10039
5 202 216 WEST 146 STREET 36690 10039
6 450 450 WEST 149 STREET 36750 10031
Apartment Story Block Lot Class InspectionDate ApprovedDate
1 5A 1 402 15 C 11/25/2024 11/25/2024
2 2B 2 1922 24 B 12/19/2024 12/20/2024
3 1E 1 1639 29 B 06/05/2025 06/11/2025
4 4L 4 2031 38 B 01/06/2025 01/06/2025
5 4L 4 2031 38 B 01/06/2025 01/06/2025
6 7 1 2063 50 B 11/01/2024 11/01/2024
OriginalCertifyByDate OriginalCorrectByDate NewCertifyByDate NewCorrectByDate
1 12/15/2024 12/10/2024
2 02/10/2025 01/27/2025
3 07/31/2025 07/17/2025
4 02/25/2025 02/11/2025
5 02/25/2025 02/11/2025
6 12/23/2024 12/09/2024
CertifiedDate OrderNumber NOVID
1 12/13/2024 530 9458247
2 07/30/2025 1503 9549552
3 583 9847172
4 529 9567076
5 502 9567076
6 11/12/2024 1503 9410632
NOVDescription
1 § 27-2005, 27-2007, 27-2041.1 HMC, §238, § 309; § 107 (2) ( C) MDL AND 28 RCNY §25-171: REPLACE OR REPAIR THE SELF-CLOSING DOORS THAT IS MISSING OR DEFECTIVE LATCHSET IN THE ENTRANCE LOCATED AT APT 5A, 1st STORY, 2nd APARTMENT FROM EAST AT SOUTH
2 § 27-2046.1 HMC: REPAIR OR REPLACE THE CARBON MONOXIDE DETECTING DEVICE(S). DEFECTIVE IN THE ENTIRE APARTMENT LOCATED AT APT 2B, 2nd STORY, 2nd APARTMENT FROM NORTH AT EAST
3 § 27-2026, 2027 HMC: PROPERLY REPAIR THE SOURCE AND ABATE THE EVIDENCE OF A WATER LEAK AT CEILING IN THE BATHROOM LOCATED AT APT 1E, 1st STORY, 1st APARTMENT FROM NORTH AT EAST
4 § 27-2005 HMC: REFIT\032 1ST.WINDOW FROM WEST AT NORTH IN THE 2nd ROOM FROM EAST LOCATED AT APT 4L, 4th STORY, 3rd APARTMENT FROM WEST AT NORTH
5 § 27-2005 ADM CODE PROPERLY REPAIR WITH SIMILAR MATERIAL THE BROKEN OR DEFECTIVE SILICONE CAULKING AT BATHTUB IN THE BATHROOM LOCATED AT APT 4L, 4th STORY, 3rd APARTMENT FROM WEST AT NORTH
6 § 27-2046.1 HMC: REPAIR OR REPLACE THE CARBON MONOXIDE DETECTING DEVICE(S). MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 7, 1st STORY, 2nd APARTMENT FROM SOUTH AT WEST
NOVIssuedDate CurrentStatusID CurrentStatus CurrentStatusDate NovType
1 11/26/2024 9 VIOLATION DISMISSED 02/24/2025 Original
2 12/23/2024 4 NOV CERTIFIED LATE 07/30/2025 Original
3 06/12/2025 2 NOV SENT OUT 06/12/2025 Original
4 01/07/2025 2 NOV SENT OUT 01/07/2025 Original
5 01/07/2025 2 NOV SENT OUT 01/07/2025 Original
6 11/04/2024 19 VIOLATION CLOSED 11/22/2024 Original
ViolationStatus RentImpairing Latitude Longitude CommunityBoard
1 Close N 40.72576 -73.98287 3
2 Open N 40.80370 -73.95271 10
3 Open N 40.79517 -73.94330 11
4 Open N 40.82245 -73.93894 10
5 Open N 40.82245 -73.93894 10
6 Close N 40.82737 -73.94479 9
CouncilDistrict CensusTract BIN BBL NTA
1 2 32 1005084 1004020015 East Village
2 9 218 1058377 1019220024 Harlem (South)
3 8 172 1052203 1016390029 East Harlem (South)
4 9 232 1060487 1020310038 Harlem (North)
5 9 232 1060487 1020310038 Harlem (North)
6 7 231 1061525 1020630050 Hamilton Heights-Sugar Hill
colnames(housing_filtered) [1] "ViolationID" "BuildingID" "RegistrationID"
[4] "BoroID" "Borough" "HouseNumber"
[7] "LowHouseNumber" "HighHouseNumber" "StreetName"
[10] "StreetCode" "Postcode" "Apartment"
[13] "Story" "Block" "Lot"
[16] "Class" "InspectionDate" "ApprovedDate"
[19] "OriginalCertifyByDate" "OriginalCorrectByDate" "NewCertifyByDate"
[22] "NewCorrectByDate" "CertifiedDate" "OrderNumber"
[25] "NOVID" "NOVDescription" "NOVIssuedDate"
[28] "CurrentStatusID" "CurrentStatus" "CurrentStatusDate"
[31] "NovType" "ViolationStatus" "RentImpairing"
[34] "Latitude" "Longitude" "CommunityBoard"
[37] "CouncilDistrict" "CensusTract" "BIN"
[40] "BBL" "NTA"
head(sr311_filtered) Unique.Key Created.Date Closed.Date Agency
1 66674892 10/31/2025 11:59:47 PM 11/01/2025 12:05:10 AM DSNY
2 66675637 10/31/2025 11:59:32 PM 11/01/2025 01:12:16 AM NYPD
3 66685926 10/31/2025 11:59:10 PM 11/01/2025 04:52:29 AM NYPD
4 66677254 10/31/2025 11:58:27 PM 11/01/2025 12:35:08 AM NYPD
5 66675675 10/31/2025 11:58:20 PM 10/31/2025 11:59:23 PM NYPD
6 66685266 10/31/2025 11:57:28 PM 11/01/2025 12:46:31 AM NYPD
Agency.Name Complaint.Type Descriptor
1 Department of Sanitation Vendor Enforcement Food Vendor
2 New York City Police Department Noise - Residential Loud Talking
3 New York City Police Department Noise - Residential Loud Music/Party
4 New York City Police Department Noise - Residential Loud Music/Party
5 New York City Police Department Noise - Vehicle Car/Truck Music
6 New York City Police Department Noise - Street/Sidewalk Loud Music/Party
Location.Type Incident.Zip Incident.Address Street.Name
1 Street 10002 9 DELANCY STREET DELANCY STREET
2 Residential Building/House 10025 249 WEST 109 STREET WEST 109 STREET
3 Residential Building/House 10004 25 BROAD STREET BROAD STREET
4 Residential Building/House 10025 550 WEST 114 STREET WEST 114 STREET
5 Street/Sidewalk 10009 87 ST MARKS PLACE ST MARKS PLACE
6 Street/Sidewalk 10026 140 WEST 113 STREET WEST 113 STREET
Cross.Street.1 Cross.Street.2 Intersection.Street.1
1 BOWERY CHRYSTIE STREET BOWERY
2 AMSTERDAM AVENUE BROADWAY AMSTERDAM AVENUE
3 EXCHANGE PLACE BEAVER STREET EXCHANGE PLACE
4 AMSTERDAM AVENUE BROADWAY AMSTERDAM AVENUE
5 1 AVENUE AVENUE A 1 AVENUE
6 ST NICHOLAS AVENUE ADAM CLAYTON POWELL JR BOULEVARD ST NICHOLAS AVENUE
Intersection.Street.2 Address.Type City Landmark
1 CHRYSTIE STREET ADDRESS NEW YORK DELANCEY STREET
2 BROADWAY ADDRESS NEW YORK WEST 109 STREET
3 BEAVER STREET ADDRESS NEW YORK BROAD STREET
4 BROADWAY ADDRESS NEW YORK WEST 114 STREET
5 AVENUE A ADDRESS NEW YORK ST MARKS PLACE
6 ADAM CLAYTON POWELL JR BOULEVARD ADDRESS NEW YORK WEST 113 STREET
Facility.Type Status Due.Date
1 Closed
2 Closed
3 Closed
4 Closed
5 Closed
6 Closed
Resolution.Description
1 N/A
2 The Police Department responded to the complaint and with the information available observed no evidence of the violation at that time.
3 The Police Department responded to the complaint and determined that police action was not necessary.
4 The Police Department responded to the complaint and took action to fix the condition.
5 The Police Department responded to the complaint and took action to fix the condition.
6 The Police Department responded to the complaint and with the information available observed no evidence of the violation at that time.
Resolution.Action.Updated.Date Community.Board BBL Borough
1 11/01/2025 12:05:13 AM 03 MANHATTAN 1004240026 MANHATTAN
2 11/01/2025 01:12:19 AM 07 MANHATTAN 1018810008 MANHATTAN
3 11/01/2025 04:52:35 AM 01 MANHATTAN 1000257502 MANHATTAN
4 11/01/2025 12:35:12 AM 09 MANHATTAN 1018850156 MANHATTAN
5 10/31/2025 11:59:26 PM 03 MANHATTAN 1004360057 MANHATTAN
6 11/01/2025 12:46:34 AM 10 MANHATTAN 1018220057 MANHATTAN
X.Coordinate..State.Plane. Y.Coordinate..State.Plane. Open.Data.Channel.Type
1 986,083 201,633 PHONE
2 993,702 231,848 PHONE
3 981,128 196,533 ONLINE
4 994,254 233,029 MOBILE
5 988,427 204,307 PHONE
6 997,281 231,054 ONLINE
Park.Facility.Name Park.Borough Vehicle.Type Taxi.Company.Borough
1 Unspecified MANHATTAN
2 Unspecified MANHATTAN
3 Unspecified MANHATTAN
4 Unspecified MANHATTAN
5 Unspecified MANHATTAN Car
6 Unspecified MANHATTAN
Taxi.Pick.Up.Location Bridge.Highway.Name Bridge.Highway.Direction Road.Ramp
1
2
3
4
5
6
Bridge.Highway.Segment Latitude Longitude
1 40.72011 -73.99339
2 40.80304 -73.96586
3 40.70611 -74.01126
4 40.80628 -73.96386
5 40.72745 -73.98493
6 40.80086 -73.95293
Location
1 (40.72011254060361, -73.99338737853634)
2 (40.80304007929047, -73.96585904688254)
3 (40.706113895406446, -74.01126037586627)
4 (40.80628099138207, -73.9638634441027)
5 (40.72745122642037, -73.9849296444645)
6 (40.80085621357295, -73.95293311739029)
colnames(sr311_filtered) [1] "Unique.Key" "Created.Date"
[3] "Closed.Date" "Agency"
[5] "Agency.Name" "Complaint.Type"
[7] "Descriptor" "Location.Type"
[9] "Incident.Zip" "Incident.Address"
[11] "Street.Name" "Cross.Street.1"
[13] "Cross.Street.2" "Intersection.Street.1"
[15] "Intersection.Street.2" "Address.Type"
[17] "City" "Landmark"
[19] "Facility.Type" "Status"
[21] "Due.Date" "Resolution.Description"
[23] "Resolution.Action.Updated.Date" "Community.Board"
[25] "BBL" "Borough"
[27] "X.Coordinate..State.Plane." "Y.Coordinate..State.Plane."
[29] "Open.Data.Channel.Type" "Park.Facility.Name"
[31] "Park.Borough" "Vehicle.Type"
[33] "Taxi.Company.Borough" "Taxi.Pick.Up.Location"
[35] "Bridge.Highway.Name" "Bridge.Highway.Direction"
[37] "Road.Ramp" "Bridge.Highway.Segment"
[39] "Latitude" "Longitude"
[41] "Location"
# ==== Libraries ====
library(dplyr)
library(lubridate)
housing <- housing_filtered |>
mutate(
# standardize all date columns
inspectiondate = parse_date_time(InspectionDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
approveddate = parse_date_time(ApprovedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
originalcertifybydate = parse_date_time(OriginalCertifyByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
originalcorrectbydate = parse_date_time(OriginalCorrectByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
newcertifybydate = parse_date_time(NewCertifyByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
newcorrectbydate = parse_date_time(NewCorrectByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
certifieddate = parse_date_time(CertifiedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
novissueddate = parse_date_time(NOVIssuedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
currentstatusdate = parse_date_time(CurrentStatusDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
borough_std = toupper(Borough)
) |>
# subset only Manhattan & date range
filter(
borough_std == "MANHATTAN",
inspectiondate >= as.Date("2024-11-01") & inspectiondate <= as.Date("2025-11-01")
)
# check
cat("Housing Violations Data Rows after filtering:", nrow(housing), "\n")Housing Violations Data Rows after filtering: 143112
library(dplyr)
library(lubridate)
sr311 <- sr311_filtered |>
mutate(
# standardize date columns
created_date = parse_date_time(Created.Date, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
closed_date = parse_date_time(Closed.Date, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
borough_std = toupper(Borough)
) |>
# subset only Manhattan & last 1 year
filter(
borough_std == "MANHATTAN",
created_date >= as.Date("2024-11-01") & created_date <= as.Date("2025-11-01")
)
# check
cat("311 Service Requests Data Rows after filtering:", nrow(sr311), "\n")311 Service Requests Data Rows after filtering: 701761
# housing <- housing_filtered
# sr311 <- sr311_filtered
cat("Housing Violations rows:", nrow(housing), "\n")Housing Violations rows: 143112
cat("311 Service Requests rows:", nrow(sr311), "\n")311 Service Requests rows: 701761
colnames(housing) [1] "ViolationID" "BuildingID" "RegistrationID"
[4] "BoroID" "Borough" "HouseNumber"
[7] "LowHouseNumber" "HighHouseNumber" "StreetName"
[10] "StreetCode" "Postcode" "Apartment"
[13] "Story" "Block" "Lot"
[16] "Class" "InspectionDate" "ApprovedDate"
[19] "OriginalCertifyByDate" "OriginalCorrectByDate" "NewCertifyByDate"
[22] "NewCorrectByDate" "CertifiedDate" "OrderNumber"
[25] "NOVID" "NOVDescription" "NOVIssuedDate"
[28] "CurrentStatusID" "CurrentStatus" "CurrentStatusDate"
[31] "NovType" "ViolationStatus" "RentImpairing"
[34] "Latitude" "Longitude" "CommunityBoard"
[37] "CouncilDistrict" "CensusTract" "BIN"
[40] "BBL" "NTA" "inspectiondate"
[43] "approveddate" "originalcertifybydate" "originalcorrectbydate"
[46] "newcertifybydate" "newcorrectbydate" "certifieddate"
[49] "novissueddate" "currentstatusdate" "borough_std"
colnames(sr311) [1] "Unique.Key" "Created.Date"
[3] "Closed.Date" "Agency"
[5] "Agency.Name" "Complaint.Type"
[7] "Descriptor" "Location.Type"
[9] "Incident.Zip" "Incident.Address"
[11] "Street.Name" "Cross.Street.1"
[13] "Cross.Street.2" "Intersection.Street.1"
[15] "Intersection.Street.2" "Address.Type"
[17] "City" "Landmark"
[19] "Facility.Type" "Status"
[21] "Due.Date" "Resolution.Description"
[23] "Resolution.Action.Updated.Date" "Community.Board"
[25] "BBL" "Borough"
[27] "X.Coordinate..State.Plane." "Y.Coordinate..State.Plane."
[29] "Open.Data.Channel.Type" "Park.Facility.Name"
[31] "Park.Borough" "Vehicle.Type"
[33] "Taxi.Company.Borough" "Taxi.Pick.Up.Location"
[35] "Bridge.Highway.Name" "Bridge.Highway.Direction"
[37] "Road.Ramp" "Bridge.Highway.Segment"
[39] "Latitude" "Longitude"
[41] "Location" "created_date"
[43] "closed_date" "borough_std"
library(dplyr)
library(ggplot2)
library(ggalluvial)
library(lubridate)
library(tidyr)
library(scales)
# ---------- 1) Ensure Housing Dates are Date class ----------
housing <- housing_filtered |>
mutate(
# inspectiondate = as.Date(parse_date_time(InspectionDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
# approveddate = as.Date(parse_date_time(ApprovedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
# novissueddate = as.Date(parse_date_time(NOVIssuedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
# currentstatusdate = as.Date(parse_date_time(CurrentStatusDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")))
# standardize all date columns
inspectiondate = parse_date_time(InspectionDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
approveddate = parse_date_time(ApprovedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
originalcertifybydate = parse_date_time(OriginalCertifyByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
originalcorrectbydate = parse_date_time(OriginalCorrectByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
newcertifybydate = parse_date_time(NewCertifyByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
newcorrectbydate = parse_date_time(NewCorrectByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
certifieddate = parse_date_time(CertifiedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
novissueddate = parse_date_time(NOVIssuedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
currentstatusdate = parse_date_time(CurrentStatusDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
# borough_std = toupper(Borough)
)
# ---------- 2) Ensure 311 Dates are Date class ----------
sr311 <- sr311_filtered |>
mutate(
created_date = as.Date(parse_date_time(Created.Date, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
closed_date = as.Date(parse_date_time(Closed.Date, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")))
)
# ---------- 3) Housing Violations Timeline ----------
housing |>
filter(!is.na(inspectiondate)) |>
count(inspectiondate) |>
ggplot(aes(x = inspectiondate, y = n)) +
geom_line(color = "red") +
labs(title = "Housing Violations Timeline (Manhattan, Last Year)", x = "Inspection Date", y = "Count") +
theme_minimal()# ---------- 4) 311 Complaints Timeline ----------
sr311 |>
filter(!is.na(created_date)) |>
count(created_date) |>
ggplot(aes(x = created_date, y = n)) +
geom_line(color = "orange") +
labs(title = "311 Complaints Timeline (Manhattan, Last Year)", x = "Created Date", y = "Count") +
theme_minimal()# ---------- 5) Monthly aggregated Housing vs 311 ----------
housing_monthly <- housing |>
filter(!is.na(inspectiondate)) |>
mutate(month = floor_date(inspectiondate, "month")) |>
group_by(month) |>
summarise(HousingCount = n(), .groups = "drop")
sr311_monthly <- sr311 |>
filter(!is.na(created_date)) |>
mutate(month = floor_date(created_date, "month")) |>
group_by(month) |>
summarise(ComplaintCount = n(), .groups = "drop")
monthly_combined <- housing_monthly |>
left_join(sr311_monthly, by = "month") |>
pivot_longer(cols = c(HousingCount, ComplaintCount), names_to = "Source", values_to = "Count")
monthly_combined |>
ggplot(aes(axis1 = month, axis2 = Source, y = Count)) +
geom_alluvium(aes(fill = Source), width = 0.25) +
geom_stratum(width = 0.25) +
geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
theme_minimal() +
labs(title = "Monthly Counts: Housing Violations vs 311 Complaints", y = "Count")library(dplyr)
library(ggplot2)
library(ggalluvial)
library(lubridate)
library(scales)
library(tidyr)
# ---------- 3) Quick row counts ----------
cat("Housing Violations (Manhattan, 1yr):", nrow(housing_filtered), "rows\n")Housing Violations (Manhattan, 1yr): 143112 rows
cat("311 Service Requests (Manhattan, 1yr):", nrow(sr311_filtered), "rows\n")311 Service Requests (Manhattan, 1yr): 701761 rows
cat("Housing Violations rows:", nrow(housing), "\n")Housing Violations rows: 143112
cat("311 Service Requests rows:", nrow(sr311), "\n")311 Service Requests rows: 701761
# ---------- 4) Alluvial plot: Housing Class -> Current Status ----------
housing_filtered |>
filter(!is.na(Class), !is.na(CurrentStatusID)) |>
count(Class, CurrentStatusID) |>
ggplot(aes(axis1 = Class, axis2 = CurrentStatusID, y = n)) +
geom_alluvium(aes(fill = Class), width = 0.25) +
geom_stratum(width = 0.25) +
geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
theme_minimal() +
labs(title = "Housing Violations: Class → Current Status", y = "Count")# ---------- 5) 311 Complaint Type Counts ----------
sr311_filtered |>
filter(!is.na(Complaint.Type)) |>
count(Complaint.Type) |>
slice_max(n, n = 15) |>
ggplot(aes(x = reorder(Complaint.Type, n), y = n)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(title = "Top 15 311 Complaint Types (Manhattan, Last Year)", x = "", y = "Count") +
theme_minimal()# ---------- 6) Housing Violations by Street ----------
housing_filtered |>
filter(!is.na(StreetName)) |>
count(StreetName) |>
slice_max(n, n = 20) |>
ggplot(aes(x = reorder(StreetName, n), y = n)) +
geom_col(fill = "darkgreen") +
coord_flip() +
labs(title = "Top 20 Manhattan Streets by Housing Violations (Last Year)", x = "", y = "Count") +
theme_minimal()# ---------- 7) 311 Complaints by Zip ----------
sr311_filtered |>
filter(!is.na(Incident.Zip)) |>
count(Incident.Zip) |>
slice_max(n, n = 20) |>
ggplot(aes(x = reorder(Incident.Zip, n), y = n)) +
geom_col(fill = "purple") +
coord_flip() +
labs(title = "Top 20 ZIP Codes by 311 Complaints (Manhattan, Last Year)", x = "", y = "Count") +
theme_minimal()library(dplyr)
library(ggplot2)
library(ggalluvial)
library(lubridate)
library(tidyr)
library(scales)
# ---------- 1) Standardize Housing Dates ----------
housing <- housing_filtered |>
mutate(
# inspectiondate = as.Date(parse_date_time(InspectionDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
# approveddate = as.Date(parse_date_time(ApprovedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
# novissueddate = as.Date(parse_date_time(NOVIssuedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
# currentstatusdate = as.Date(parse_date_time(CurrentStatusDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
# borough_std = toupper(Borough)
inspectiondate = parse_date_time(InspectionDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
approveddate = parse_date_time(ApprovedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
originalcertifybydate = parse_date_time(OriginalCertifyByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
originalcorrectbydate = parse_date_time(OriginalCorrectByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
newcertifybydate = parse_date_time(NewCertifyByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
newcorrectbydate = parse_date_time(NewCorrectByDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
certifieddate = parse_date_time(CertifiedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
novissueddate = parse_date_time(NOVIssuedDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
currentstatusdate = parse_date_time(CurrentStatusDate, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY")),
borough_std = toupper(Borough)
) |>
filter(borough_std == "MANHATTAN", inspectiondate >= as.Date("2024-11-01") & inspectiondate <= as.Date("2025-11-01"))
# ---------- 2) Standardize 311 Dates ----------
sr311 <- sr311_filtered |>
mutate(
created_date = as.Date(parse_date_time(Created.Date, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
closed_date = as.Date(parse_date_time(Closed.Date, orders = c("Ymd HMS", "Ymd", "mdY HMS", "mdY"))),
borough_std = toupper(Borough)
) |>
filter(borough_std == "MANHATTAN", created_date >= as.Date("2024-11-01") & created_date <= as.Date("2025-11-01"))
# ---------- 3) Quick Row Counts ----------
cat("Housing Violations Rows:", nrow(housing), "\n")Housing Violations Rows: 143112
cat("311 Service Requests Rows:", nrow(sr311), "\n")311 Service Requests Rows: 701761
# ---------- 4) Alluvial: Housing Class -> Current Status ----------
housing |>
filter(!is.na(Class), !is.na(CurrentStatusID)) |>
count(Class, CurrentStatusID) |>
ggplot(aes(axis1 = Class, axis2 = CurrentStatusID, y = n)) +
geom_alluvium(aes(fill = Class), width = 0.25) +
geom_stratum(width = 0.25) +
geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
theme_minimal() +
labs(title = "Housing Violations: Class → Current Status", y = "Count")# ---------- 5) Top 311 Complaint Types ----------
sr311 |>
filter(!is.na(Complaint.Type)) |>
count(Complaint.Type) |>
slice_max(n, n = 15) |>
ggplot(aes(x = reorder(Complaint.Type, n), y = n)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(title = "Top 15 311 Complaint Types (Manhattan, Last Year)", x = "", y = "Count") +
theme_minimal()# ---------- 6) Housing Violations by Street ----------
housing |>
filter(!is.na(StreetName)) |>
count(StreetName) |>
slice_max(n, n = 20) |>
ggplot(aes(x = reorder(StreetName, n), y = n)) +
geom_col(fill = "darkgreen") +
coord_flip() +
labs(title = "Top 20 Manhattan Streets by Housing Violations", x = "", y = "Count") +
theme_minimal()# ---------- 7) 311 Complaints by ZIP ----------
sr311 |>
filter(!is.na(Incident.Zip)) |>
count(Incident.Zip) |>
slice_max(n, n = 20) |>
ggplot(aes(x = reorder(Incident.Zip, n), y = n)) +
geom_col(fill = "purple") +
coord_flip() +
labs(title = "Top 20 ZIP Codes by 311 Complaints", x = "", y = "Count") +
theme_minimal()# ---------- 8) Housing Violations Timeline ----------
housing |>
filter(!is.na(inspectiondate)) |>
count(inspectiondate) |>
ggplot(aes(x = inspectiondate, y = n)) +
geom_line(color = "red") +
labs(title = "Housing Violations Timeline", x = "Inspection Date", y = "Count") +
theme_minimal()# ---------- 9) 311 Complaints Timeline ----------
sr311 |>
filter(!is.na(created_date)) |>
count(created_date) |>
ggplot(aes(x = created_date, y = n)) +
geom_line(color = "orange") +
labs(title = "311 Complaints Timeline", x = "Created Date", y = "Count") +
theme_minimal()# ---------- 10) Monthly Aggregated Housing vs 311 ----------
housing_monthly <- housing |>
filter(!is.na(inspectiondate)) |>
mutate(month = floor_date(inspectiondate, "month")) |>
group_by(month) |>
summarise(HousingCount = n(), .groups = "drop")
sr311_monthly <- sr311 |>
filter(!is.na(created_date)) |>
mutate(month = floor_date(created_date, "month")) |>
group_by(month) |>
summarise(ComplaintCount = n(), .groups = "drop")
monthly_combined <- housing_monthly |>
left_join(sr311_monthly, by = "month") |>
pivot_longer(cols = c(HousingCount, ComplaintCount), names_to = "Source", values_to = "Count")
monthly_combined |>
ggplot(aes(axis1 = month, axis2 = Source, y = Count)) +
geom_alluvium(aes(fill = Source), width = 0.25) +
geom_stratum(width = 0.25) +
geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
theme_minimal() +
labs(title = "Monthly Counts: Housing Violations vs 311 Complaints", y = "Count")# ---------- 11) Alluvial: Housing Class -> NOV Type ----------
housing |>
filter(!is.na(Class), !is.na(NovType)) |>
count(Class, NovType) |>
ggplot(aes(axis1 = Class, axis2 = NovType, y = n)) +
geom_alluvium(aes(fill = Class), width = 0.25) +
geom_stratum(width = 0.25) +
geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
theme_minimal() +
labs(title = "Housing Violations: Class → NOV Type", y = "Count")# ---------- 12) Top Housing Violation Classes ----------
housing |>
filter(!is.na(Class)) |>
count(Class) |>
slice_max(n, n = 15) |>
ggplot(aes(x = reorder(Class, n), y = n)) +
geom_col(fill = "darkred") +
coord_flip() +
labs(title = "Top 15 Housing Violation Classes (Manhattan, Last Year)", x = "", y = "Count") +
theme_minimal()# library(dplyr)
# library(ggplot2)
# library(ggalluvial)
# # Only keep rows where at least inspectiondate exists
# viol_comparison <- housing |>
# select(ViolationID, inspectiondate, approveddate, certifieddate, currentstatusdate) |>
# filter(!is.na(inspectiondate)) |>
# pivot_longer(
# cols = c(inspectiondate, approveddate, certifieddate, currentstatusdate),
# names_to = "Stage",
# values_to = "Date"
# ) |>
# filter(!is.na(Date)) |>
# mutate(Stage = factor(Stage,
# levels = c("inspectiondate", "approveddate", "certifieddate", "currentstatusdate"),
# labels = c("Inspection", "Approved", "Certified", "Final Status")
# ))
#
# # Check the first few rows
# head(viol_comparison)
# # Alluvial plot
# ggplot(viol_comparison,
# aes(axis1 = Stage, axis2 = Date, y = 1)) +
# geom_alluvium(aes(fill = Stage), width = 0.2, alpha = 0.7) +
# geom_stratum(width = 0.2) +
# geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 2.5) +
# theme_minimal() +
# labs(
# title = "Housing Violations: Flow Across Key Dates (Manhattan, Last Year)",
# x = "Stage",
# y = "Count"
# )Code with last 3 years Data:
library(data.table)
# ========== LOAD FINAL DATASETS ==========
cat("========== LOADING FINAL DATASETS ==========\n\n")========== LOADING FINAL DATASETS ==========
housing_raw <- fread("datasets/Housing_Violations_2022_onwards.csv")
sr311_raw <- fread("datasets/311_Housing_Complaints_2022_onwards.csv")
# ========== HOUSING VIOLATIONS ==========
cat("===== DATASET 1: HOUSING VIOLATIONS (2022+) =====\n\n")===== DATASET 1: HOUSING VIOLATIONS (2022+) =====
cat("Dimensions:\n")Dimensions:
cat("Rows:", nrow(housing_raw), "\n")Rows: 589005
cat("Columns:", ncol(housing_raw), "\n\n")Columns: 41
cat("Column names:\n")Column names:
print(names(housing_raw)) [1] "ViolationID" "BuildingID" "RegistrationID"
[4] "BoroID" "Borough" "HouseNumber"
[7] "LowHouseNumber" "HighHouseNumber" "StreetName"
[10] "StreetCode" "Postcode" "Apartment"
[13] "Story" "Block" "Lot"
[16] "Class" "InspectionDate" "ApprovedDate"
[19] "OriginalCertifyByDate" "OriginalCorrectByDate" "NewCertifyByDate"
[22] "NewCorrectByDate" "CertifiedDate" "OrderNumber"
[25] "NOVID" "NOVDescription" "NOVIssuedDate"
[28] "CurrentStatusID" "CurrentStatus" "CurrentStatusDate"
[31] "NovType" "ViolationStatus" "RentImpairing"
[34] "Latitude" "Longitude" "CommunityBoard"
[37] "CouncilDistrict" "CensusTract" "BIN"
[40] "BBL" "NTA"
cat("\n")cat("Data types:\n")Data types:
print(sapply(housing_raw, class))$ViolationID
[1] "integer"
$BuildingID
[1] "integer"
$RegistrationID
[1] "integer"
$BoroID
[1] "integer"
$Borough
[1] "character"
$HouseNumber
[1] "character"
$LowHouseNumber
[1] "character"
$HighHouseNumber
[1] "character"
$StreetName
[1] "character"
$StreetCode
[1] "integer"
$Postcode
[1] "integer"
$Apartment
[1] "character"
$Story
[1] "integer"
$Block
[1] "integer"
$Lot
[1] "character"
$Class
[1] "character"
$InspectionDate
[1] "IDate" "Date"
$ApprovedDate
[1] "character"
$OriginalCertifyByDate
[1] "character"
$OriginalCorrectByDate
[1] "character"
$NewCertifyByDate
[1] "character"
$NewCorrectByDate
[1] "character"
$CertifiedDate
[1] "character"
$OrderNumber
[1] "integer"
$NOVID
[1] "integer"
$NOVDescription
[1] "character"
$NOVIssuedDate
[1] "character"
$CurrentStatusID
[1] "integer"
$CurrentStatus
[1] "character"
$CurrentStatusDate
[1] "character"
$NovType
[1] "character"
$ViolationStatus
[1] "character"
$RentImpairing
[1] "character"
$Latitude
[1] "numeric"
$Longitude
[1] "numeric"
$CommunityBoard
[1] "integer"
$CouncilDistrict
[1] "integer"
$CensusTract
[1] "integer"
$BIN
[1] "integer"
$BBL
[1] "integer"
$NTA
[1] "character"
cat("\n")cat("First 5 records:\n")First 5 records:
print(head(housing_raw, 5)) ViolationID BuildingID RegistrationID BoroID Borough HouseNumber
<int> <int> <int> <int> <char> <char>
1: 15004789 805012 100420 1 MANHATTAN 144REAR
2: 14817075 9458 104092 1 MANHATTAN 1
3: 14831546 40441 118126 1 MANHATTAN 158
4: 15313781 18016 120064 1 MANHATTAN 314
5: 14838558 8157 118733 1 MANHATTAN 3885
LowHouseNumber HighHouseNumber StreetName StreetCode Postcode
<char> <char> <char> <int> <int>
1: 144rear 146 REAR LUDLOW STREET 25090 10002
2: 1 7 CHRISTOPHER STREET 15210 10014
3: 158 158 WEST 132 STREET 36410 10027
4: 314 314 EAST 89 STREET 18770 10128
5: 3885 3897 BROADWAY 13610 10032
Apartment Story Block Lot Class InspectionDate ApprovedDate
<char> <int> <int> <char> <char> <IDat> <char>
1: NA 411 45 A 2022-03-08 03/08/2022
2: 9C 9 610 60 A 2022-02-10 02/10/2022
3: 3N 3 1916 55 B 2022-02-11 02/11/2022
4: 0 1551 43 C 2022-08-16 08/17/2022
5: 20B 2 2137 80 B 2022-02-14 02/15/2022
OriginalCertifyByDate OriginalCorrectByDate NewCertifyByDate
<char> <char> <char>
1: 06/25/2022 06/11/2022
2: 05/31/2022 05/17/2022
3: 04/04/2022 03/21/2022
4: 09/06/2022 09/01/2022
5: 04/06/2022 03/23/2022
NewCorrectByDate CertifiedDate OrderNumber NOVID
<char> <char> <int> <int>
1: 1507 7793231
2: 554 7636476
3: 579 7638268
4: 12/05/2022 530 8007840
5: 03/18/2022 579 7651705
NOVDescription
<char>
1: (A) § HMC:FILE ANNUAL BEDBUG REPORT IN ACCORDANCE WITH HPD RULE AS DESCRIBED ON THE BACK OF THIS NOTICE OF VIOLATION OR AS DESCRIBED ON HPDâ\u0080\u0099S WEBSITE, WWW.NYC.GOV\\HPD, SEARCH BED BUGS.
2: § 27-2005 ADM CODE PAINT METAL IN ACCORDANCE WITH DEPT. REGULATION RADIATOR AT THE SOUTH WALL IN THE KITCHEN LOCATED AT APT 9C, 9th STORY, 3rd APARTMENT FROM EAST AT SOUTH
3: § 27-2026 ADM CODE REPAIR THE LEAKY AND/OR DEFECTIVE FAUCETS AT SINK IN THE KITCHENETTE LOCATED AT B-ROOM 3N, 3rd STORY, 1st B-ROOM FROM NORTH AT EAST
4: § 27-2005, 27-2007, 27-2041.1 HMC: REPLACE OR REPAIR THE SELF-CLOSING DOORS THAT IS MISSING OR DEFECTIVE AT BULKHEAD
5: § 27-2026 ADM CODE REPAIR THE LEAKY AND/OR DEFECTIVE FAUCETS AT WASHBASIN IN THE BATHROOM LOCATED AT APT 20B, 2nd STORY, 1st APARTMENT FROM NORTH AT EAST
NOVIssuedDate CurrentStatusID CurrentStatus CurrentStatusDate NovType
<char> <int> <char> <char> <char>
1: 03/08/2022 9 VIOLATION DISMISSED 03/23/2022 Original
2: 02/11/2022 19 VIOLATION CLOSED 03/06/2024 Original
3: 02/14/2022 2 NOV SENT OUT 02/14/2022 Original
4: 08/18/2022 19 VIOLATION CLOSED 06/29/2023 Original
5: 02/16/2022 9 VIOLATION DISMISSED 05/31/2022 Original
ViolationStatus RentImpairing Latitude Longitude CommunityBoard
<char> <char> <num> <num> <int>
1: Close N NA NA NA
2: Close N 40.73407 -73.99976 2
3: Open N 40.81285 -73.94384 10
4: Close N 40.77955 -73.94975 8
5: Close N 40.83741 -73.94244 12
CouncilDistrict CensusTract BIN BBL NTA
<int> <int> <int> <int> <char>
1: NA NA NA NA
2: 3 71 1010692 1006100060 West Village
3: 9 226 1058117 1019160055 Harlem (North)
4: 5 14602 1050093 1015510043 Upper East Side-Yorkville
5: 7 245 1063363 1021370080 Washington Heights (South)
cat("\n")cat("Summary statistics:\n")Summary statistics:
print(summary(housing_raw[, .(ViolationID, BuildingID, InspectionDate,
CurrentStatus, ViolationStatus, RentImpairing,
Latitude, Longitude)])) ViolationID BuildingID InspectionDate CurrentStatus
Min. :14753195 Min. : 1 Min. :2022-01-01 Length:589005
1st Qu.:15642804 1st Qu.: 13158 1st Qu.:2022-12-28 Class :character
Median :16530701 Median : 27948 Median :2023-12-13 Mode :character
Mean :16557793 Mean : 101240 Mean :2023-12-13
3rd Qu.:17447307 3rd Qu.: 41552 3rd Qu.:2024-11-22
Max. :18445551 Max. :1017519 Max. :2025-11-23
ViolationStatus RentImpairing Latitude Longitude
Length:589005 Length:589005 Min. :40.70 Min. :-74.02
Class :character Class :character 1st Qu.:40.78 1st Qu.:-73.97
Mode :character Mode :character Median :40.81 Median :-73.95
Mean :40.80 Mean :-73.95
3rd Qu.:40.83 3rd Qu.:-73.94
Max. :40.88 Max. :-73.91
NA's :77 NA's :77
cat("\n")# ========== 311 HOUSING COMPLAINTS ==========
cat("\n===== DATASET 2: 311 HOUSING COMPLAINTS (2022+) =====\n\n")
===== DATASET 2: 311 HOUSING COMPLAINTS (2022+) =====
cat("Dimensions:\n")Dimensions:
cat("Rows:", nrow(sr311_raw ), "\n")Rows: 425248
cat("Columns:", ncol(sr311_raw ), "\n\n")Columns: 42
cat("Column names:\n")Column names:
print(names(sr311_raw )) [1] "Unique Key" "Created Date"
[3] "Closed Date" "Agency"
[5] "Agency Name" "Complaint Type"
[7] "Descriptor" "Location Type"
[9] "Incident Zip" "Incident Address"
[11] "Street Name" "Cross Street 1"
[13] "Cross Street 2" "Intersection Street 1"
[15] "Intersection Street 2" "Address Type"
[17] "City" "Landmark"
[19] "Facility Type" "Status"
[21] "Due Date" "Resolution Description"
[23] "Resolution Action Updated Date" "Community Board"
[25] "BBL" "Borough"
[27] "X Coordinate (State Plane)" "Y Coordinate (State Plane)"
[29] "Open Data Channel Type" "Park Facility Name"
[31] "Park Borough" "Vehicle Type"
[33] "Taxi Company Borough" "Taxi Pick Up Location"
[35] "Bridge Highway Name" "Bridge Highway Direction"
[37] "Road Ramp" "Bridge Highway Segment"
[39] "Latitude" "Longitude"
[41] "Location" "CreatedDate"
cat("\n")cat("Data types:\n")Data types:
print(sapply(sr311_raw , class))$`Unique Key`
[1] "integer"
$`Created Date`
[1] "character"
$`Closed Date`
[1] "character"
$Agency
[1] "character"
$`Agency Name`
[1] "character"
$`Complaint Type`
[1] "character"
$Descriptor
[1] "character"
$`Location Type`
[1] "character"
$`Incident Zip`
[1] "integer"
$`Incident Address`
[1] "character"
$`Street Name`
[1] "character"
$`Cross Street 1`
[1] "character"
$`Cross Street 2`
[1] "character"
$`Intersection Street 1`
[1] "character"
$`Intersection Street 2`
[1] "character"
$`Address Type`
[1] "character"
$City
[1] "character"
$Landmark
[1] "character"
$`Facility Type`
[1] "character"
$Status
[1] "character"
$`Due Date`
[1] "logical"
$`Resolution Description`
[1] "character"
$`Resolution Action Updated Date`
[1] "character"
$`Community Board`
[1] "character"
$BBL
[1] "integer"
$Borough
[1] "character"
$`X Coordinate (State Plane)`
[1] "character"
$`Y Coordinate (State Plane)`
[1] "character"
$`Open Data Channel Type`
[1] "character"
$`Park Facility Name`
[1] "character"
$`Park Borough`
[1] "character"
$`Vehicle Type`
[1] "logical"
$`Taxi Company Borough`
[1] "logical"
$`Taxi Pick Up Location`
[1] "logical"
$`Bridge Highway Name`
[1] "logical"
$`Bridge Highway Direction`
[1] "logical"
$`Road Ramp`
[1] "logical"
$`Bridge Highway Segment`
[1] "logical"
$Latitude
[1] "numeric"
$Longitude
[1] "numeric"
$Location
[1] "character"
$CreatedDate
[1] "IDate" "Date"
cat("\n")cat("First 5 records:\n")First 5 records:
print(head(sr311_raw , 5)) Unique Key Created Date Closed Date Agency
<int> <char> <char> <char>
1: 66934900 11/23/2025 11:58:19 PM HPD
2: 66932786 11/23/2025 11:55:32 PM HPD
3: 66931601 11/23/2025 11:52:04 PM HPD
4: 66929338 11/23/2025 11:51:49 PM HPD
5: 66933739 11/23/2025 11:48:51 PM HPD
Agency Name Complaint Type
<char> <char>
1: Department of Housing Preservation and Development HEAT/HOT WATER
2: Department of Housing Preservation and Development PLUMBING
3: Department of Housing Preservation and Development HEAT/HOT WATER
4: Department of Housing Preservation and Development HEAT/HOT WATER
5: Department of Housing Preservation and Development HEAT/HOT WATER
Descriptor Location Type Incident Zip Incident Address
<char> <char> <int> <char>
1: ENTIRE BUILDING RESIDENTIAL BUILDING 10028 425 EAST 84 STREET
2: BATHTUB/SHOWER RESIDENTIAL BUILDING 10040 121 FT GEORGE AVENUE
3: ENTIRE BUILDING RESIDENTIAL BUILDING 10040 121 FT GEORGE AVENUE
4: ENTIRE BUILDING RESIDENTIAL BUILDING 10028 425 EAST 84 STREET
5: ENTIRE BUILDING RESIDENTIAL BUILDING 10025 242 WEST 109 STREET
Street Name Cross Street 1 Cross Street 2 Intersection Street 1
<char> <char> <char> <char>
1: EAST 84 STREET
2: FT GEORGE AVENUE
3: FT GEORGE AVENUE
4: EAST 84 STREET
5: WEST 109 STREET
Intersection Street 2 Address Type City Landmark Facility Type Status
<char> <char> <char> <char> <char> <char>
1: ADDRESS NEW YORK Open
2: ADDRESS NEW YORK Open
3: ADDRESS NEW YORK Open
4: ADDRESS NEW YORK Open
5: ADDRESS NEW YORK Open
Due Date
<lgcl>
1: NA
2: NA
3: NA
4: NA
5: NA
Resolution Description
<char>
1: This complaint is a duplicate of a building-wide condition already reported by another tenant. The original complaint is still open, and HPD may only need to confirm that the condition exists by inspecting one apartment. If we cannot contact the tenant from the original complaint or get access to that apartment, HPD may attempt to contact the person who filed this complaint to verify the correction of the condition or may conduct an inspection of your unit. You can check HPDONLINE to see if a
2: The following complaint conditions are still open. HPD has already attempted to notify the property owner that the condition exists; the tenant should provide access for the owner to make the repair. HPD may attempt to contact the tenant by phone to verify the correction of the condition or an HPD Inspector may attempt to conduct an inspection.
3: This complaint is a duplicate of a building-wide condition already reported by another tenant. The original complaint is still open, and HPD may only need to confirm that the condition exists by inspecting one apartment. If we cannot contact the tenant from the original complaint or get access to that apartment, HPD may attempt to contact the person who filed this complaint to verify the correction of the condition or may conduct an inspection of your unit. You can check HPDONLINE to see if a
4: The following complaint conditions are still open. HPD has already attempted to notify the property owner that the condition exists; the tenant should provide access for the owner to make the repair. HPD may attempt to contact the tenant by phone to verify the correction of the condition or an HPD Inspector may attempt to conduct an inspection.
5: This complaint is a duplicate of a building-wide condition already reported by another tenant. The original complaint is still open, and HPD may only need to confirm that the condition exists by inspecting one apartment. If we cannot contact the tenant from the original complaint or get access to that apartment, HPD may attempt to contact the person who filed this complaint to verify the correction of the condition or may conduct an inspection of your unit. You can check HPDONLINE to see if a
Resolution Action Updated Date Community Board BBL Borough
<char> <char> <int> <char>
1: 11/23/2025 12:00:00 AM 08 MANHATTAN 1015640012 MANHATTAN
2: 11/23/2025 12:00:00 AM 12 MANHATTAN 1021490265 MANHATTAN
3: 11/23/2025 12:00:00 AM 12 MANHATTAN 1021490265 MANHATTAN
4: 11/23/2025 12:00:00 AM 08 MANHATTAN 1015640012 MANHATTAN
5: 11/23/2025 12:00:00 AM 07 MANHATTAN 1018800053 MANHATTAN
X Coordinate (State Plane) Y Coordinate (State Plane) Open Data Channel Type
<char> <char> <char>
1: 998,207 221,730 ONLINE
2: 1,004,446 251,683 PHONE
3: 1,004,446 251,683 PHONE
4: 998,207 221,730 ONLINE
5: 993,732 231,824 PHONE
Park Facility Name Park Borough Vehicle Type Taxi Company Borough
<char> <char> <lgcl> <lgcl>
1: Unspecified MANHATTAN NA NA
2: Unspecified MANHATTAN NA NA
3: Unspecified MANHATTAN NA NA
4: Unspecified MANHATTAN NA NA
5: Unspecified MANHATTAN NA NA
Taxi Pick Up Location Bridge Highway Name Bridge Highway Direction Road Ramp
<lgcl> <lgcl> <lgcl> <lgcl>
1: NA NA NA NA
2: NA NA NA NA
3: NA NA NA NA
4: NA NA NA NA
5: NA NA NA NA
Bridge Highway Segment Latitude Longitude
<lgcl> <num> <num>
1: NA 40.77526 -73.94961
2: NA 40.85746 -73.92699
3: NA 40.85746 -73.92699
4: NA 40.77526 -73.94961
5: NA 40.80297 -73.96575
Location CreatedDate
<char> <IDat>
1: (40.77526292188198, -73.94960785720805) 2025-11-23
2: (40.85746342279737, -73.92699164532567) 2025-11-23
3: (40.85746342279737, -73.92699164532567) 2025-11-23
4: (40.77526292188198, -73.94960785720805) 2025-11-23
5: (40.80297417373674, -73.96575071974307) 2025-11-23
cat("\n")cat("Summary statistics:\n")Summary statistics:
print(summary(sr311_raw [, .(`Unique Key`, `Created Date`, `Closed Date`,
Agency, `Complaint Type`, Status,
Latitude, Longitude)])) Unique Key Created Date Closed Date Agency
Min. :52939155 Length:425248 Length:425248 Length:425248
1st Qu.:56470870 Class :character Class :character Class :character
Median :59938206 Mode :character Mode :character Mode :character
Mean :59892593
3rd Qu.:63464249
Max. :66935006
Complaint Type Status Latitude Longitude
Length:425248 Length:425248 Min. :40.69 Min. :-74.02
Class :character Class :character 1st Qu.:40.77 1st Qu.:-73.97
Mode :character Mode :character Median :40.81 Median :-73.95
Mean :40.80 Mean :-73.96
3rd Qu.:40.83 3rd Qu.:-73.94
Max. :40.88 Max. :-73.91
cat("\n")# ========== KEY DISTRIBUTIONS ==========
cat("\n===== KEY DISTRIBUTIONS =====\n\n")
===== KEY DISTRIBUTIONS =====
cat("Housing Violations - Top 10 Current Status:\n")Housing Violations - Top 10 Current Status:
print(housing_raw[, .N, by=CurrentStatus][order(-N)][1:10]) CurrentStatus N
<char> <int>
1: VIOLATION DISMISSED 197906
2: VIOLATION CLOSED 180829
3: NOV SENT OUT 112090
4: FIRST NO ACCESS TO RE- INSPECT VIOLATION 18629
5: NOT COMPLIED WITH 17976
6: INFO NOV SENT OUT 12927
7: NOTICE OF ISSUANCE SENT TO TENANT 11160
8: VIOLATION WILL BE REINSPECTED 10263
9: CIV14 MAILED 7244
10: DEFECT LETTER ISSUED 5934
cat("\n")cat("311 Complaints - Top 10 Complaint Types:\n")311 Complaints - Top 10 Complaint Types:
print(sr311_raw [, .N, by=`Complaint Type`][order(-N)][1:10]) Complaint Type N
<char> <int>
1: HEAT/HOT WATER 229222
2: PLUMBING 50637
3: PAINT/PLASTER 44624
4: WATER LEAK 30190
5: General Construction/Plumbing 25207
6: Maintenance or Facility 20202
7: Elevator 16588
8: Plumbing 2197
9: ELEVATOR 1302
10: Non-Residential Heat 1094
cat("\n")cat("311 Complaints - Status distribution:\n")311 Complaints - Status distribution:
print(sr311_raw [, .N, by=Status][order(-N)]) Status N
<char> <int>
1: Closed 419047
2: Open 5152
3: In Progress 922
4: Assigned 124
5: Unspecified 3
cat("\n")cat("311 Complaints - Top Agencies:\n")311 Complaints - Top Agencies:
print(sr311_raw [, .N, by=Agency][order(-N)][1:10]) Agency N
<char> <int>
1: HPD 358603
2: DOB 43687
3: DPR 20202
4: DOHMH 1579
5: DOE 1064
6: DEP 99
7: DOT 14
8: <NA> NA
9: <NA> NA
10: <NA> NA
library(data.table)
# Just load the filtered datasets we already have
housing_raw <- fread("datasets/Housing_Violations_2022_onwards.csv")
sr311_raw <- fread("datasets/311_Housing_Complaints_2022_onwards.csv")
cat("Working with FULL datasets:\n")Working with FULL datasets:
cat("Housing Violations:", nrow(housing_raw), "rows x", ncol(housing_raw), "columns\n")Housing Violations: 589005 rows x 41 columns
cat("311 Complaints:", nrow(sr311_raw ), "rows x", ncol(sr311_raw ), "columns\n")311 Complaints: 425248 rows x 42 columns
library(data.table)
library(ggplot2)
set.seed(2025)
# Load datasets
housing_raw <- fread("datasets/Housing_Violations_2022_onwards.csv")
sr311_raw <- fread("datasets/311_Housing_Complaints_2022_onwards.csv")
# ========== MISSING VALUE ANALYSIS ==========
# === HOUSING VIOLATIONS ===
cat("===== HOUSING VIOLATIONS - MISSING VALUES =====\n\n")===== HOUSING VIOLATIONS - MISSING VALUES =====
# Calculate missing counts and percentages
missing_viol <- data.table(
Column = names(housing_raw),
Missing_Count = sapply(housing_raw, function(x) sum(is.na(x) | x == "")),
Total_Rows = nrow(housing_raw)
)
missing_viol[, Missing_Pct := round(Missing_Count / Total_Rows * 100, 2)]
missing_viol <- missing_viol[order(-Missing_Pct)]
cat("Columns with missing values:\n")Columns with missing values:
print(missing_viol[Missing_Pct > 0]) Column Missing_Count Total_Rows Missing_Pct
<char> <int> <int> <num>
1: NewCertifyByDate 585410 589005 99.39
2: NewCorrectByDate 585410 589005 99.39
3: CertifiedDate 360194 589005 61.15
4: Apartment 198543 589005 33.71
5: Story 77094 589005 13.09
6: OriginalCertifyByDate 34593 589005 5.87
7: OriginalCorrectByDate 34593 589005 5.87
8: NOVID 34593 589005 5.87
9: NOVIssuedDate 34593 589005 5.87
10: NovType 34593 589005 5.87
11: BIN 305 589005 0.05
12: BBL 305 589005 0.05
13: Postcode 147 589005 0.02
14: Latitude 77 589005 0.01
15: Longitude 77 589005 0.01
16: CommunityBoard 77 589005 0.01
17: CouncilDistrict 77 589005 0.01
18: CensusTract 77 589005 0.01
19: NTA 77 589005 0.01
cat("\nColumns with >50% missing:\n")
Columns with >50% missing:
print(missing_viol[Missing_Pct > 50]) Column Missing_Count Total_Rows Missing_Pct
<char> <int> <int> <num>
1: NewCertifyByDate 585410 589005 99.39
2: NewCorrectByDate 585410 589005 99.39
3: CertifiedDate 360194 589005 61.15
# === 311 COMPLAINTS ===
cat("\n\n===== 311 HOUSING COMPLAINTS - MISSING VALUES =====\n\n")
===== 311 HOUSING COMPLAINTS - MISSING VALUES =====
# Calculate missing counts and percentages
missing_311 <- data.table(
Column = names(sr311_raw ),
Missing_Count = sapply(sr311_raw , function(x) sum(is.na(x) | x == "")),
Total_Rows = nrow(sr311_raw )
)
missing_311[, Missing_Pct := round(Missing_Count / Total_Rows * 100, 2)]
missing_311 <- missing_311[order(-Missing_Pct)]
cat("Columns with missing values:\n")Columns with missing values:
print(missing_311[Missing_Pct > 0]) Column Missing_Count Total_Rows Missing_Pct
<char> <int> <int> <num>
1: Due Date 425248 425248 100.00
2: Vehicle Type 425248 425248 100.00
3: Taxi Company Borough 425248 425248 100.00
4: Taxi Pick Up Location 425248 425248 100.00
5: Bridge Highway Name 425248 425248 100.00
6: Bridge Highway Direction 425248 425248 100.00
7: Road Ramp 425248 425248 100.00
8: Bridge Highway Segment 425248 425248 100.00
9: Facility Type 417587 425248 98.20
10: Cross Street 2 400618 425248 94.21
11: Intersection Street 2 400618 425248 94.21
12: Cross Street 1 400546 425248 94.19
13: Intersection Street 1 400546 425248 94.19
14: Landmark 400506 425248 94.18
15: Location Type 43668 425248 10.27
16: Closed Date 5151 425248 1.21
17: Resolution Description 2417 425248 0.57
18: BBL 2302 425248 0.54
19: Resolution Action Updated Date 902 425248 0.21
cat("\nColumns with >50% missing:\n")
Columns with >50% missing:
print(missing_311[Missing_Pct > 50]) Column Missing_Count Total_Rows Missing_Pct
<char> <int> <int> <num>
1: Due Date 425248 425248 100.00
2: Vehicle Type 425248 425248 100.00
3: Taxi Company Borough 425248 425248 100.00
4: Taxi Pick Up Location 425248 425248 100.00
5: Bridge Highway Name 425248 425248 100.00
6: Bridge Highway Direction 425248 425248 100.00
7: Road Ramp 425248 425248 100.00
8: Bridge Highway Segment 425248 425248 100.00
9: Facility Type 417587 425248 98.20
10: Cross Street 2 400618 425248 94.21
11: Intersection Street 2 400618 425248 94.21
12: Cross Street 1 400546 425248 94.19
13: Intersection Street 1 400546 425248 94.19
14: Landmark 400506 425248 94.18
# Save results for visualization
fwrite(missing_viol, "datasets/missing_values_violations.csv")
fwrite(missing_311, "datasets/missing_values_311.csv")
cat("\n\nMissing value summaries saved!\n")
Missing value summaries saved!
Missing Value Analysis - Code
library(data.table)
library(ggplot2)
set.seed(2025)
# Load missing value summaries
missing_viol <- fread("datasets/missing_values_violations.csv")
missing_311 <- fread("datasets/missing_values_311.csv")
# ========== GRAPH 1: BAR CHART - HOUSING VIOLATIONS ==========
# Filter to columns with >0% missing
missing_viol_plot <- missing_viol[Missing_Pct > 0]
# Create bar chart
p1 <- ggplot(missing_viol_plot, aes(x = reorder(Column, Missing_Pct), y = Missing_Pct)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(
title = "Missing Values in Housing Violations Dataset",
subtitle = "Manhattan, 2022-2025",
x = "Column Name",
y = "Percentage Missing (%)"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", size = 16),
axis.text.y = element_text(size = 11)
)
print(p1)# ========== GRAPH 2: BAR CHART - 311 COMPLAINTS ==========
# Filter to columns with >0% missing
missing_311_plot <- missing_311[Missing_Pct > 0]
# Create bar chart
p2 <- ggplot(missing_311_plot, aes(x = reorder(Column, Missing_Pct), y = Missing_Pct)) +
geom_bar(stat = "identity", fill = "coral") +
coord_flip() +
labs(
title = "Missing Values in 311 Housing Complaints Dataset",
subtitle = "Manhattan, 2022-2025",
x = "Column Name",
y = "Percentage Missing (%)"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold", size = 16),
axis.text.y = element_text(size = 11)
)
print(p2)# ========== SUMMARY STATISTICS ==========
cat("\n===== KEY FINDINGS =====\n\n")
===== KEY FINDINGS =====
cat("HOUSING VIOLATIONS:\n")HOUSING VIOLATIONS:
cat("- Total columns:", nrow(missing_viol), "\n")- Total columns: 41
cat("- Columns with missing data:", nrow(missing_viol[Missing_Pct > 0]), "\n")- Columns with missing data: 19
cat("- Columns >50% missing:", nrow(missing_viol[Missing_Pct > 50]), "\n")- Columns >50% missing: 3
cat("- Columns 100% complete:", nrow(missing_viol[Missing_Pct == 0]), "\n\n")- Columns 100% complete: 21
cat("311 COMPLAINTS:\n")311 COMPLAINTS:
cat("- Total columns:", nrow(missing_311), "\n")- Total columns: 42
cat("- Columns with missing data:", nrow(missing_311[Missing_Pct > 0]), "\n")- Columns with missing data: 19
cat("- Columns >50% missing:", nrow(missing_311[Missing_Pct > 50]), "\n")- Columns >50% missing: 14
cat("- Columns 100% missing:", nrow(missing_311[Missing_Pct == 100]), "\n")- Columns 100% missing: 8
cat("- Columns 100% complete:", nrow(missing_311[Missing_Pct == 0]), "\n")- Columns 100% complete: 22
2.2 Missing Value Analysis
The missing value patterns in both datasets reveal important characteristics about data collection and reporting practices.
Housing Violations Dataset
The Housing Violations dataset contains 589,005 records with 41 columns. Of these, 19 columns have missing values while 22 columns are complete. Three columns have more than 50% missing data:
- NewCertifyByDate and NewCorrectByDate (99.4% missing): These fields are only populated when violations are re-certified or correction deadlines are extended, indicating that most violations follow the original timeline without modifications.
- CertifiedDate (61.2% missing): This high percentage reflects that many violations remain open or were dismissed without certification, consistent with the status distribution showing significant numbers of dismissed and uncertified violations.
Additional notable patterns include 33.7% missing apartment numbers, likely representing violations that affect entire buildings or common areas rather than specific units. Geographic fields (Latitude, Longitude, Community Board) show minimal missingness (<0.02%), demonstrating excellent geocoding coverage for Manhattan addresses.
311 Housing Complaints Dataset
The 311 dataset contains 425,248 records with 42 columns. Eight columns are completely empty (100% missing), all related to transportation categories (taxi, vehicle, bridge/highway fields) that are irrelevant to housing complaints and can be excluded from analysis.
Cross-street and intersection fields show 94% missingness, as most complaints use address-based location rather than intersection descriptions. The Facility Type field (98.2% missing) and Landmark field (94.2% missing) are rarely used for residential complaints.
Critical fields for analysis have minimal missingness: BBL (0.54%), Closed Date (1.21% - only open cases), and Resolution Description (0.57%), ensuring robust data availability for investigating complaint resolution patterns and building-level aggregation.
Enhanced Missing Value Analysis - Add Heatmap
The error is because pivot_longer() can’t combine different data types (integers, characters, dates, etc.) into one column. We need to convert everything to character first. Here’s the fix:
library(data.table)
library(tidyverse)
# Load datasets
housing_raw <- fread("datasets/Housing_Violations_2022_onwards.csv")
sr311_raw <- fread("datasets/311_Housing_Complaints_2022_onwards.csv")
# ========== MISSING VALUE HEATMAP - HOUSING VIOLATIONS ==========
# Convert data.table to tibble and make all columns character for pivoting
viol_long <- as_tibble(housing_raw) |>
mutate(across(everything(), as.character)) |> # Convert all to character
rownames_to_column("row_id") |>
pivot_longer(cols = -row_id, names_to = "name", values_to = "value") |>
mutate(missing = ifelse(is.na(value) | value == "", "yes", "no"))
# Plot heatmap - sample rows for visibility
ggplot(viol_long |> filter(as.numeric(row_id) <= 1000), # First 1000 rows only
aes(x = name, y = fct_rev(row_id), fill = missing)) +
geom_tile(color = "white") +
scale_fill_viridis_d() +
labs(
title = "Missing Data Pattern: Housing Violations (sample)",
subtitle = "First 1000 records, Manhattan 2022-2025",
x = "Column",
y = "Row"
) +
theme_bw(base_size = 14) +
theme(
axis.text.y = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1)
)# ========== MISSING VALUE HEATMAP - 311 COMPLAINTS ==========
complaints_long <- as_tibble(sr311_raw ) |>
mutate(across(everything(), as.character)) |> # Convert all to character
rownames_to_column("row_id") |>
pivot_longer(cols = -row_id, names_to = "name", values_to = "value") |>
mutate(missing = ifelse(is.na(value) | value == "", "yes", "no"))
# Plot heatmap - sample rows
ggplot(complaints_long |> filter(as.numeric(row_id) <= 1000), # First 1000 rows
aes(x = name, y = fct_rev(row_id), fill = missing)) +
geom_tile(color = "white") +
scale_fill_viridis_d() +
labs(
title = "Missing Data Pattern: 311 Complaints (sample)",
subtitle = "First 1000 records, Manhattan 2022-2025",
x = "Column",
y = "Row"
) +
theme_bw(base_size = 14) +
theme(
axis.text.y = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1)
)EXCELLENT POINT! You’re absolutely right to question this!
Why Random Sampling? The heatmap with 589,005 rows would be:
Computationally expensive (slow to render)
Visually useless - you can’t see individual tiles when there are 589K tiny rows on the y-axis
The image would be a giant blur
BUT - you already have the complete missing value analysis in your bar charts! Those show the exact percentages for all rows.
Better Question: Do You Even Need the Heatmap? Looking at the lecture slides you shared, the heatmap is useful when:
Small dataset (like mtcars with 32 rows)
You want to see patterns of missingness (e.g., “if column A is missing, column B is also missing”)
For your large datasets, the bar charts you already made are BETTER because: ✅ Show exact percentages ✅ Easy to read ✅ Already done and look great!
library(redav)
# Simple aggregated missing plot
plot_missing(housing_raw, percent = TRUE)plot_missing(sr311_raw , percent = TRUE)library(redav)
# ========== HOUSING VIOLATIONS - FIXED ==========
plot_missing(housing_raw,
percent = TRUE,
num_char = 5, max_cols = 10) # Shorten column names to 8 characters# Alternative: Manually create cleaner version
# First, let's see how many columns we're dealing with
cat("Number of columns:", ncol(housing_raw))Number of columns: 41
library(data.table)
library(ggplot2)
# Load missing value summary
missing_viol <- fread("datasets/missing_values_violations.csv")
missing_311 <- fread("datasets/missing_values_311.csv")
# Filter to only columns with significant missingness (>5%)
missing_viol_sig <- missing_viol[Missing_Pct > 5]
missing_311_sig <- missing_311[Missing_Pct > 5]
# Create subset datasets with only these columns
cols_to_keep_viol <- missing_viol_sig$Column
cols_to_keep_311 <- missing_311_sig$Column
dt_viol_subset <- housing_raw[, ..cols_to_keep_viol]
sr311_raw_subset <- sr311_raw[, ..cols_to_keep_311]
# Now plot with fewer columns (more readable)
plot_missing(dt_viol_subset, percent = TRUE)plot_missing(sr311_raw_subset, percent = TRUE)ask Prof about formatting or is that even required ?
Research Question: When do housing violations occur? Are there seasonal patterns (e.g., more heating violations in winter)?
What we’re trying to find:
Do violations increase during certain months?
Are there yearly trends?
Any COVID-related changes?
Graph type: Time series line chart
library(data.table)
library(ggplot2)
set.seed(2025)
# Load data
housing_raw <- fread("datasets/Housing_Violations_2022_onwards.csv")
# Convert date and extract time components
housing_raw[, InspectionDate := as.Date(InspectionDate)]
housing_raw[, YearMonth := format(InspectionDate, "%Y-%m")]
housing_raw[, Month := month(InspectionDate)]
housing_raw[, Year := year(InspectionDate)]
# Count violations by month and year
viol_by_month <- housing_raw[, .N, by = .(Year, Month)][order(Year, Month)]
# Create the plot
ggplot(viol_by_month, aes(x = Month, y = N, color = factor(Year), group = Year)) +
geom_line(linewidth = 1.2) +
geom_point(size = 2.5) +
scale_x_continuous(breaks = 1:12,
labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) +
scale_y_continuous(labels = scales::comma) +
labs(
title = "Housing Violations by Month and Year",
subtitle = "Manhattan, 2022-2025",
x = "Month",
y = "Number of Violations",
color = "Year"
) +
theme_minimal(base_size = 16) +
theme(
plot.title = element_text(face = "bold"),
legend.position = "right"
)# Investigate February 2022
feb_2022 <- housing_raw[Year == 2022 & Month == 2]
cat("February 2022 violations:", nrow(feb_2022), "\n")February 2022 violations: 28414
# Check if they're all inspected in Feb or entered in Feb
summary(feb_2022$InspectionDate) Min. 1st Qu. Median Mean 3rd Qu. Max.
"2022-02-01" "2022-02-15" "2022-02-15" "2022-02-15" "2022-02-16" "2022-02-28"
summary(feb_2022$ApprovedDate) Length Class Mode
28414 character character
🔍 What the Data Reveals: February 2022 has 28,414 violations - and looking at the dates:
Median/Mean inspection date: February 15, 2022
Most inspections clustered around Feb 15-16
This is NOT a data error - this appears to be a mass inspection event or systematic sweep by HPD in mid-February 2022.
Possible Explanations: Annual inspection cycle - HPD may conduct building-wide inspections at this time
Post-winter inspection sweep - checking heating violations after winter
Administrative/policy change - new enforcement initiative started
Backlog processing - accumulated complaints processed together
Updated Commentary for Graph: “Housing violations show notable spikes in early 2022 (particularly February with ~29,000 violations), likely reflecting systematic inspection efforts by HPD. Seasonal patterns are less pronounced than expected, with violations remaining relatively stable across months at 10,000-12,000 per month after the initial 2022 surge. The 2025 data shows incomplete year-to-date figures through November. This suggests violations are driven more by inspection scheduling and enforcement policy than by seasonal factors like winter heating issues.”
Graph 2: Smoothed Temporal View Research Question: What are the overall trends when we smooth out monthly fluctuations?
library(data.table)
library(ggplot2)
# Aggregate by quarter for smoother trend
housing_raw[, Quarter := quarter(InspectionDate)]
housing_raw[, YearQuarter := paste0(Year, "-Q", Quarter)]
viol_by_quarter <- housing_raw[, .N, by = .(Year, Quarter, YearQuarter)][order(Year, Quarter)]
# Create quarter labels
viol_by_quarter[, QuarterLabel := paste0(Year, "\nQ", Quarter)]
# Plot
ggplot(viol_by_quarter, aes(x = factor(YearQuarter, levels = YearQuarter),
y = N, group = 1)) +
geom_line(color = "steelblue", linewidth = 1.2) +
geom_point(color = "steelblue", size = 3) +
scale_y_continuous(labels = scales::comma) +
labs(
title = "Housing Violations by Quarter",
subtitle = "Manhattan, 2022-2025 (smoothed trend)",
x = "Quarter",
y = "Number of Violations"
) +
theme_minimal(base_size = 16) +
theme(
plot.title = element_text(face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1)
)The quarterly graph shows a much clearer trend - violations started high in Q1 2022 (~48K), dropped, then fluctuated between 30K-47K per quarter, with a recent decline in Q4 2025.
Key Patterns: Sharp Drop After Q1 2022
Q1 2022: ~48,000 violations
Q2 2022: ~30,000 violations (37% decrease)
This confirms the Feb 2022 spike was a one-time event
Cyclical Pattern
Violations rise and fall in waves
Peaks: Q4 2022 (~41K), Q2 2024 (~47K), Q3 2025 (~41K)
Troughs: Q2 2022 (~30K), Q1 2025 (~30K)
No Clear Seasonality
No consistent “winter spike” for heating violations
Fluctuations appear more related to enforcement cycles than seasons
Recent Decline
Q4 2025: ~26,000 (lowest point)
Could be incomplete data (we’re currently in November 2025)
OR enforcement has genuinely decreased
Graph 3: What Types of Violations Are Most Common? Research Question: What are the most common housing violations that tenants and inspectors encounter?
library(data.table)
library(ggplot2)
library(stringr)
# Check what violation type columns we have
cat("Checking violation classification columns:\n")Checking violation classification columns:
cat("CurrentStatus values:\n")CurrentStatus values:
print(head(housing_raw[, .N, by=CurrentStatus][order(-N)], 10)) CurrentStatus N
<char> <int>
1: VIOLATION DISMISSED 197906
2: VIOLATION CLOSED 180829
3: NOV SENT OUT 112090
4: FIRST NO ACCESS TO RE- INSPECT VIOLATION 18629
5: NOT COMPLIED WITH 17976
6: INFO NOV SENT OUT 12927
7: NOTICE OF ISSUANCE SENT TO TENANT 11160
8: VIOLATION WILL BE REINSPECTED 10263
9: CIV14 MAILED 7244
10: DEFECT LETTER ISSUED 5934
cat("\n\nViolationStatus values:\n")
ViolationStatus values:
print(head(housing_raw[, .N, by=ViolationStatus][order(-N)], 10)) ViolationStatus N
<char> <int>
1: Close 386382
2: Open 202623
cat("\n\nClass values:\n")
Class values:
print(head(housing_raw[, .N, by=Class][order(-N)], 10)) Class N
<char> <int>
1: B 237753
2: C 184746
3: A 132840
4: I 33666
cat("\n\nRentImpairing values:\n")
RentImpairing values:
print(housing_raw[, .N, by=RentImpairing]) RentImpairing N
<char> <int>
1: N 542652
2: Y 46353
# NOVDescription is too detailed, let's extract key violation types
# Show a few examples first
cat("\n\nSample NOVDescription:\n")
Sample NOVDescription:
print(head(housing_raw$NOVDescription, 5))[1] "(A) § HMC:FILE ANNUAL BEDBUG REPORT IN ACCORDANCE WITH HPD RULE AS DESCRIBED ON THE BACK OF THIS NOTICE OF VIOLATION OR AS DESCRIBED ON HPDâ\u0080\u0099S WEBSITE, WWW.NYC.GOV\\HPD, SEARCH BED BUGS."
[2] "§ 27-2005 ADM CODE PAINT METAL IN ACCORDANCE WITH DEPT. REGULATION RADIATOR AT THE SOUTH WALL IN THE KITCHEN LOCATED AT APT 9C, 9th STORY, 3rd APARTMENT FROM EAST AT SOUTH"
[3] "§ 27-2026 ADM CODE REPAIR THE LEAKY AND/OR DEFECTIVE FAUCETS AT SINK IN THE KITCHENETTE LOCATED AT B-ROOM 3N, 3rd STORY, 1st B-ROOM FROM NORTH AT EAST"
[4] "§ 27-2005, 27-2007, 27-2041.1 HMC: REPLACE OR REPAIR THE SELF-CLOSING DOORS THAT IS MISSING OR DEFECTIVE AT BULKHEAD"
[5] "§ 27-2026 ADM CODE REPAIR THE LEAKY AND/OR DEFECTIVE FAUCETS AT WASHBASIN IN THE BATHROOM LOCATED AT APT 20B, 2nd STORY, 1st APARTMENT FROM NORTH AT EAST"
Graph 3: Violation Severity and Status Research Question: What types of violations are most common, and how seriously are they being addressed?
library(data.table)
library(ggplot2)
# === GRAPH 3A: Violation Class Distribution ===
# Class definitions: A=Non-Hazardous, B=Hazardous, C=Immediately Hazardous, I=Failure to Register
class_data <- housing_raw[, .N, by=Class][order(-N)]
# Add descriptive labels
class_data[, ClassLabel := fcase(
Class == "A", "Class A: Non-Hazardous",
Class == "B", "Class B: Hazardous",
Class == "C", "Class C: Immediately Hazardous",
Class == "I", "Class I: Failure to Register",
default = Class
)]
ggplot(class_data, aes(x = reorder(ClassLabel, N), y = N, fill = Class)) +
geom_bar(stat = "identity") +
geom_text(aes(label = scales::comma(N)), hjust = -0.1, size = 5) +
coord_flip() +
scale_fill_manual(values = c("A" = "#90EE90", "B" = "#FFD700",
"C" = "#FF6347", "I" = "#87CEEB")) +
scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0, 0.15))) +
labs(
title = "Housing Violations by Severity Class",
subtitle = "Manhattan, 2022-2025",
x = "Violation Class",
y = "Number of Violations",
caption = "Class A: Non-hazardous | Class B: Hazardous | Class C: Immediately Hazardous"
) +
theme_minimal(base_size = 16) +
theme(
plot.title = element_text(face = "bold"),
legend.position = "none"
)# === GRAPH 3B: Rent-Impairing Violations ===
rent_data <- housing_raw[, .N, by=RentImpairing]
rent_data[, Label := ifelse(RentImpairing == "Y",
"Rent-Impairing\n(affects habitability)",
"Non-Rent-Impairing")]
ggplot(rent_data, aes(x = Label, y = N, fill = RentImpairing)) +
geom_bar(stat = "identity") +
geom_text(aes(label = paste0(scales::comma(N), "\n(",
round(N/sum(N)*100, 1), "%)")),
vjust = -0.5, size = 5) +
scale_fill_manual(values = c("N" = "steelblue", "Y" = "darkred")) +
scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0, 0.1))) +
labs(
title = "Rent-Impairing vs Non-Rent-Impairing Violations",
subtitle = "Manhattan, 2022-2025",
x = "",
y = "Number of Violations"
) +
theme_minimal(base_size = 16) +
theme(
plot.title = element_text(face = "bold"),
legend.position = "none"
)Excellent! Both graphs look great! Let me analyze them:
📊 Graph 3A: Violation Severity Analysis Key Findings:
Class B (Hazardous): 237,753 violations (40.4%) - MOST COMMON
Class C (Immediately Hazardous): 184,746 (31.4%) - VERY SERIOUS
Class A (Non-Hazardous): 132,840 (22.6%)
Class I (Failure to Register): 33,666 (5.7%)
Commentary: “Over 70% of violations are classified as hazardous (Class B) or immediately hazardous (Class C), indicating serious safety and habitability concerns in Manhattan housing. Class B violations, which require correction within 30 days, dominate the landscape. This high proportion of serious violations underscores the importance of thorough building inspection before renting.”
📊 Graph 3B: Rent-Impairing Analysis Key Findings:
Non-Rent-Impairing: 542,652 (92.1%) - Vast majority
Rent-Impairing: 46,353 (7.9%) - Serious habitability issues
Commentary: “Only 7.9% of violations are classified as rent-impairing, meaning they directly affect habitability and could legally justify rent withholding. While most violations are non-rent-impairing, the 46,000+ rent-impairing violations represent conditions severe enough to make apartments legally uninhabitable, such as lack of heat, hot water, or structural hazards.”
Cleaning
library(data.table)
library(stringr)
# Load data
housing_raw <- fread("datasets/Housing_Violations_2022_onwards.csv")
# First, let's see what's in NOVDescription
cat("Sample NOVDescription text:\n")Sample NOVDescription text:
print(head(housing_raw$NOVDescription, 10)) [1] "(A) § HMC:FILE ANNUAL BEDBUG REPORT IN ACCORDANCE WITH HPD RULE AS DESCRIBED ON THE BACK OF THIS NOTICE OF VIOLATION OR AS DESCRIBED ON HPDâ\u0080\u0099S WEBSITE, WWW.NYC.GOV\\HPD, SEARCH BED BUGS."
[2] "§ 27-2005 ADM CODE PAINT METAL IN ACCORDANCE WITH DEPT. REGULATION RADIATOR AT THE SOUTH WALL IN THE KITCHEN LOCATED AT APT 9C, 9th STORY, 3rd APARTMENT FROM EAST AT SOUTH"
[3] "§ 27-2026 ADM CODE REPAIR THE LEAKY AND/OR DEFECTIVE FAUCETS AT SINK IN THE KITCHENETTE LOCATED AT B-ROOM 3N, 3rd STORY, 1st B-ROOM FROM NORTH AT EAST"
[4] "§ 27-2005, 27-2007, 27-2041.1 HMC: REPLACE OR REPAIR THE SELF-CLOSING DOORS THAT IS MISSING OR DEFECTIVE AT BULKHEAD"
[5] "§ 27-2026 ADM CODE REPAIR THE LEAKY AND/OR DEFECTIVE FAUCETS AT WASHBASIN IN THE BATHROOM LOCATED AT APT 20B, 2nd STORY, 1st APARTMENT FROM NORTH AT EAST"
[6] "§ 27-2046.1 HMC: REPAIR OR REPLACE THE CARBON MONOXIDE DETECTING DEVICE(S). MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 7, 3rd STORY, 1st APARTMENT FROM EAST AT SOUTH"
[7] "§ 27-2045 ADM CODE REPAIR OR REPLACE THE SMOKE DETECTOR MISSING LOCATED AT APT 2A, 2nd STORY, 1st APARTMENT FROM WEST AT NORTH"
[8] "§ 27-2046.1 HMC: REPAIR OR REPLACE THE CARBON MONOXIDE DETECTING DEVICE(S). MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 5C, 5th STORY, 1st APARTMENT FROM WEST AT NORTH"
[9] "§ 27-2045 ADM CODE REPAIR OR REPLACE THE SMOKE DETECTOR MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 5C, 5th STORY, 1st APARTMENT FROM WEST AT NORTH"
[10] "§ 27-2005 ADM CODE REPAIR THE BROKEN OR DEFECTIVE PLASTERED SURFACES AND PAINT IN A UNIFORM COLOR EAST WALL APPROX 4SQ FT AND NORTH WALL APPROX 4 SQ FT IN THE BATHROOM LOCATED AT APT 1A, 1st STORY, 2nd APARTMENT FROM SOUTH AT WEST"
# Extract key violation keywords from descriptions
housing_raw[, ViolationCategory := fcase(
# Heating/Hot Water
str_detect(NOVDescription, regex("heat|hot water|radiator|boiler|steam", ignore_case = TRUE)),
"HEAT/HOT WATER",
# Plumbing
str_detect(NOVDescription, regex("plumbing|pipe|leak|faucet|drain|water supply|washbasin", ignore_case = TRUE)),
"PLUMBING",
# Paint/Plaster
str_detect(NOVDescription, regex("paint|plaster|peeling|wall|ceiling", ignore_case = TRUE)),
"PAINT/PLASTER",
# Water Leak (specific)
str_detect(NOVDescription, regex("water leak|leaking|water damage", ignore_case = TRUE)),
"WATER LEAK",
# Elevator
str_detect(NOVDescription, regex("elevator|lift", ignore_case = TRUE)),
"ELEVATOR",
# Mold
str_detect(NOVDescription, regex("mold|mildew|moisture", ignore_case = TRUE)),
"MOLD",
# Building Structure
str_detect(NOVDescription, regex("structural|building condition|facade|exterior wall", ignore_case = TRUE)),
"BUILDING CONDITION",
# Electrical
str_detect(NOVDescription, regex("electric|wiring|outlet|light", ignore_case = TRUE)),
"ELECTRICAL",
# Pest/Sanitation
str_detect(NOVDescription, regex("rodent|pest|bedbug|garbage|sanitation", ignore_case = TRUE)),
"PEST/SANITATION",
# Door/Window
str_detect(NOVDescription, regex("door|window|lock|self-closing", ignore_case = TRUE)),
"DOOR/WINDOW",
# Default
default = "OTHER"
)]
# Check the distribution
cat("\nExtracted Violation Categories:\n")
Extracted Violation Categories:
print(housing_raw[, .N, by=ViolationCategory][order(-N)]) ViolationCategory N
<char> <int>
1: OTHER 183558
2: PAINT/PLASTER 176654
3: DOOR/WINDOW 92915
4: PEST/SANITATION 48138
5: PLUMBING 43787
6: HEAT/HOT WATER 39992
7: ELECTRICAL 2187
8: MOLD 1346
9: ELEVATOR 267
10: BUILDING CONDITION 87
11: WATER LEAK 74
# Compare with 311 Complaint Types
sr311_raw <- fread("datasets/311_Housing_Complaints_2022_onwards.csv")
cat("\n\n311 Complaint Types:\n")
311 Complaint Types:
print(sr311_raw [, .N, by=`Complaint Type`][order(-N)]) Complaint Type N
<char> <int>
1: HEAT/HOT WATER 229222
2: PLUMBING 50637
3: PAINT/PLASTER 44624
4: WATER LEAK 30190
5: General Construction/Plumbing 25207
6: Maintenance or Facility 20202
7: Elevator 16588
8: Plumbing 2197
9: ELEVATOR 1302
10: Non-Residential Heat 1094
11: School Maintenance 1064
12: OUTSIDE BUILDING 857
13: Water Leak 556
14: Heat/Hot Water 487
15: Paint/Plaster 404
16: Mold 402
17: Building Drinking Water Tank 83
18: Sewer Maintenance 68
19: Water Maintenance 31
20: Building Condition 19
21: Unstable Building 14
Complaint Type N
library(data.table)
library(stringr)
# Load data
housing_raw <- fread("datasets/Housing_Violations_2022_onwards.csv")
# STEP 1: Create ViolationCategory column first
housing_raw[, ViolationCategory := fcase(
# Heating/Hot Water
str_detect(NOVDescription, regex("heat|hot water|radiator|boiler|steam", ignore_case = TRUE)),
"HEAT/HOT WATER",
# Plumbing
str_detect(NOVDescription, regex("plumbing|pipe|faucet|drain|water supply|washbasin", ignore_case = TRUE)),
"PLUMBING",
# Paint/Plaster
str_detect(NOVDescription, regex("paint|plaster|peeling|wall|ceiling", ignore_case = TRUE)),
"PAINT/PLASTER",
# Water Leak (specific)
str_detect(NOVDescription, regex("leak", ignore_case = TRUE)),
"WATER LEAK",
# Elevator
str_detect(NOVDescription, regex("elevator|lift", ignore_case = TRUE)),
"ELEVATOR",
# Mold
str_detect(NOVDescription, regex("mold|mildew", ignore_case = TRUE)),
"MOLD",
# Building Structure
str_detect(NOVDescription, regex("structural|facade|exterior", ignore_case = TRUE)),
"BUILDING CONDITION",
# Electrical
str_detect(NOVDescription, regex("electric|wiring|outlet|light", ignore_case = TRUE)),
"ELECTRICAL",
# Pest/Sanitation
str_detect(NOVDescription, regex("rodent|pest|bedbug|garbage|sanitation", ignore_case = TRUE)),
"PEST/SANITATION",
# Door/Window
str_detect(NOVDescription, regex("door|window|lock|self-closing", ignore_case = TRUE)),
"DOOR/WINDOW",
# Default
default = "OTHER"
)]
# STEP 2: Now check distribution
cat("Extracted Violation Categories:\n")Extracted Violation Categories:
print(housing_raw[, .N, by=ViolationCategory][order(-N)]) ViolationCategory N
<char> <int>
1: PAINT/PLASTER 207685
2: OTHER 183562
3: DOOR/WINDOW 92596
4: PEST/SANITATION 48140
5: HEAT/HOT WATER 39992
6: PLUMBING 11015
7: ELECTRICAL 2187
8: WATER LEAK 1741
9: MOLD 1344
10: BUILDING CONDITION 475
11: ELEVATOR 268
# STEP 3: Check what's in OTHER
cat("\n\nSample 'OTHER' violations (first 10):\n")
Sample 'OTHER' violations (first 10):
print(head(housing_raw[ViolationCategory == "OTHER", NOVDescription], 10)) [1] "§ 27-2046.1 HMC: REPAIR OR REPLACE THE CARBON MONOXIDE DETECTING DEVICE(S). MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 7, 3rd STORY, 1st APARTMENT FROM EAST AT SOUTH"
[2] "§ 27-2045 ADM CODE REPAIR OR REPLACE THE SMOKE DETECTOR MISSING LOCATED AT APT 2A, 2nd STORY, 1st APARTMENT FROM WEST AT NORTH"
[3] "§ 27-2046.1 HMC: REPAIR OR REPLACE THE CARBON MONOXIDE DETECTING DEVICE(S). MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 5C, 5th STORY, 1st APARTMENT FROM WEST AT NORTH"
[4] "§ 27-2045 ADM CODE REPAIR OR REPLACE THE SMOKE DETECTOR MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 5C, 5th STORY, 1st APARTMENT FROM WEST AT NORTH"
[5] "§ 27-2046.1 HMC: REPAIR OR REPLACE THE CARBON MONOXIDE DETECTING DEVICE(S). MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 51, 5th STORY, 1st APARTMENT FROM SOUTH AT WEST"
[6] "HMC ADM CODE: § 27-2017.4 ABATE THE INFESTATION CONSISTING OF ROACHES IN THE ENTIRE APARTMENT LOCATED AT APT 1A, 2nd STORY, 1st APARTMENT FROM WEST AT NORTH"
[7] "§ 27-2005 ADM CODE PROPERLY REPAIR WITH SIMILAR MATERIAL THE BROKEN OR DEFECTIVE WOOD FLOOR CRACKED,SAGGING, NEAR THE BATHROOM. IN THE FOYER LOCATED AT APT 5F, 5th STORY, 2nd APARTMENT FROM EAST AT SOUTH"
[8] "§ 27-2005 ADM CODE PROPERLY REPAIR WITH SIMILAR MATERIAL THE BROKEN OR DEFECTIVE CERAMIC TILE AT FLOOR. IN THE BATHROOM LOCATED AT APT 5F, 5th STORY, 2nd APARTMENT FROM EAST AT SOUTH"
[9] "HMC ADM CODE: § 27-2017.4 ABATE THE INFESTATION CONSISTING OF ROACHES LOCATED AT APT 1C, 1st STORY, APARTMENT AT EAST"
[10] "§ 27-2053 ADM CODE PROVIDE DWELLING WITH A JANITOR OR RESPONSIBLE PERSON OR JANITORIAL SERVICE."
# STEP 4: Check Class distribution in OTHER
cat("\n\nClass distribution in OTHER:\n")
Class distribution in OTHER:
print(housing_raw[ViolationCategory == "OTHER", .N, by=Class][order(-N)]) Class N
<char> <int>
1: B 83067
2: C 53878
3: I 33666
4: A 12951
library(data.table)
library(stringr)
housing_raw <- fread("datasets/Housing_Violations_2022_onwards.csv")
# Improved categorization
housing_raw[, ViolationCategory := fcase(
# Registration (Class I)
Class == "I" | str_detect(NOVDescription, regex("registration|register", ignore_case = TRUE)),
"REGISTRATION/ADMIN",
# Smoke/CO Detectors (VERY COMMON - separate category)
str_detect(NOVDescription, regex("smoke detector|carbon monoxide|co detect", ignore_case = TRUE)),
"SMOKE/CO DETECTOR",
# Heating/Hot Water
str_detect(NOVDescription, regex("heat|hot water|radiator|boiler|steam", ignore_case = TRUE)),
"HEAT/HOT WATER",
# Paint/Plaster/Walls/Ceilings
str_detect(NOVDescription, regex("paint|plaster|peel|wall|ceiling", ignore_case = TRUE)),
"PAINT/PLASTER",
# Plumbing
str_detect(NOVDescription, regex("plumb|pipe|faucet|drain|water supply|washbasin|toilet|sink", ignore_case = TRUE)),
"PLUMBING",
# Water Leak
str_detect(NOVDescription, regex("leak", ignore_case = TRUE)),
"WATER LEAK",
# Door/Window/Lock
str_detect(NOVDescription, regex("door|window|lock|self-closing|entrance", ignore_case = TRUE)),
"DOOR/WINDOW/LOCK",
# Pest/Sanitation
str_detect(NOVDescription, regex("rodent|pest|bedbug|bed bug|roach|mice|rat|garbage|sanitation|infest", ignore_case = TRUE)),
"PEST/SANITATION",
# Floor/Ceiling defects
str_detect(NOVDescription, regex("floor|tile|carpet|wood floor|ceramic", ignore_case = TRUE)),
"FLOOR/CEILING",
# Elevator
str_detect(NOVDescription, regex("elevator|lift", ignore_case = TRUE)),
"ELEVATOR",
# Mold
str_detect(NOVDescription, regex("mold|mildew|moisture", ignore_case = TRUE)),
"MOLD",
# Electrical
str_detect(NOVDescription, regex("electric|wiring|outlet|light", ignore_case = TRUE)),
"ELECTRICAL",
# Ventilation
str_detect(NOVDescription, regex("ventilat|airflow|exhaust fan", ignore_case = TRUE)),
"VENTILATION",
# Building Management
str_detect(NOVDescription, regex("janitor|superintendent|building service", ignore_case = TRUE)),
"BUILDING MANAGEMENT",
# Fire Safety
str_detect(NOVDescription, regex("fire|sprinkler|fire escape|extinguish", ignore_case = TRUE)),
"FIRE SAFETY",
# Default
default = "OTHER"
)]
# Check new distribution
cat("REFINED Violation Categories:\n")REFINED Violation Categories:
print(housing_raw[, .N, by=ViolationCategory][order(-N)]) ViolationCategory N
<char> <int>
1: PAINT/PLASTER 206753
2: PEST/SANITATION 96473
3: DOOR/WINDOW/LOCK 95429
4: SMOKE/CO DETECTOR 40227
5: HEAT/HOT WATER 39990
6: REGISTRATION/ADMIN 35133
7: OTHER 24747
8: FLOOR/CEILING 23140
9: PLUMBING 15515
10: FIRE SAFETY 7127
11: ELECTRICAL 1753
12: WATER LEAK 1731
13: BUILDING MANAGEMENT 377
14: MOLD 363
15: VENTILATION 166
16: ELEVATOR 81
cat("\n\nRemaining OTHER (should be much smaller now):\n")
Remaining OTHER (should be much smaller now):
cat("Count:", nrow(housing_raw[ViolationCategory == "OTHER"]), "\n")Count: 24747
print(head(housing_raw[ViolationCategory == "OTHER", NOVDescription], 5))[1] "§ 27-2026 ADM CODE RESET, SO AS TO SECURE A PROPER AND TIGHT CONNECTION, THE WATER CLOSET BOWL ... IN THE BATHROOM LOCATED AT APT 3A, 3rd STORY, 1st APARTMENT FROM EAST AT SOUTH"
[2] "§ 27-2005 ADM CODE PROPERLY REPAIR THE BROKEN OR DEFECTIVE BELL BUZZER INTERCOM SYSTEM LOCATED AT APT 3A, 3rd STORY, 1st APARTMENT FROM EAST AT SOUTH"
[3] "§ 27-2070 ADM CODE PROVIDE AN ADEQUATE SUPPLY OF GAS TO THE FIXTURES TO STOVE IN THE ENTIRE APARTMENT LOCATED AT APT 2B, 2nd STORY, 1st APARTMENT FROM EAST AT SOUTH"
[4] "§ 27-2005, 2007 ADM CODE REMOVE ALL ENCUMBRANCES CONSISTING OF BICYCLE AND HOUSEHOLD ITEMS AT PUBLIC HALL, 4th STORY"
[5] "§ 27-2024 ADM CODE PROVIDE ADEQUATE SUPPLY OF COLD WATER FOR THE FIXTURES NO RUNNING COLD WATER. IN THE ENTIRE APARTMENT LOCATED AT APT 24F, 24th STORY, 3rd APARTMENT FROM NORTH AT EAST"
library(data.table)
library(stringr)
library(ggplot2)
housing_raw <- fread("datasets/Housing_Violations_2022_onwards.csv")
sr311_raw <- fread("datasets/311_Housing_Complaints_2022_onwards.csv")
# Final refined categorization
housing_raw[, ViolationCategory := fcase(
Class == "I" | str_detect(NOVDescription, regex("registration|register", ignore_case = TRUE)),
"REGISTRATION/ADMIN",
str_detect(NOVDescription, regex("smoke detector|carbon monoxide|co detect", ignore_case = TRUE)),
"SMOKE/CO DETECTOR",
str_detect(NOVDescription, regex("heat|hot water|radiator|boiler|steam", ignore_case = TRUE)),
"HEAT/HOT WATER",
str_detect(NOVDescription, regex("paint|plaster|peel|wall|ceiling", ignore_case = TRUE)),
"PAINT/PLASTER",
str_detect(NOVDescription, regex("plumb|pipe|faucet|drain|water supply|washbasin|toilet|sink|water closet|cold water", ignore_case = TRUE)),
"PLUMBING",
str_detect(NOVDescription, regex("leak", ignore_case = TRUE)),
"WATER LEAK",
str_detect(NOVDescription, regex("door|window|lock|self-closing|entrance", ignore_case = TRUE)),
"DOOR/WINDOW/LOCK",
str_detect(NOVDescription, regex("rodent|pest|bedbug|bed bug|roach|mice|rat|garbage|sanitation|infest", ignore_case = TRUE)),
"PEST/SANITATION",
str_detect(NOVDescription, regex("floor|tile|carpet|wood floor|ceramic", ignore_case = TRUE)),
"FLOOR/CEILING",
str_detect(NOVDescription, regex("elevator|lift", ignore_case = TRUE)),
"ELEVATOR",
str_detect(NOVDescription, regex("mold|mildew|moisture", ignore_case = TRUE)),
"MOLD",
str_detect(NOVDescription, regex("electric|wiring|outlet|light", ignore_case = TRUE)),
"ELECTRICAL",
str_detect(NOVDescription, regex("gas|stove|appliance", ignore_case = TRUE)),
"GAS/APPLIANCES",
str_detect(NOVDescription, regex("ventilat|airflow|exhaust fan", ignore_case = TRUE)),
"VENTILATION",
str_detect(NOVDescription, regex("janitor|superintendent|building service", ignore_case = TRUE)),
"BUILDING MANAGEMENT",
str_detect(NOVDescription, regex("fire|sprinkler|fire escape|extinguish", ignore_case = TRUE)),
"FIRE SAFETY",
str_detect(NOVDescription, regex("bell|buzzer|intercom", ignore_case = TRUE)),
"BUILDING SYSTEMS",
default = "OTHER"
)]
# Comparison data
viol_counts <- housing_raw[, .N, by=ViolationCategory][order(-N)]
cat("Final Violation Categories:\n")Final Violation Categories:
print(viol_counts) ViolationCategory N
<char> <int>
1: PAINT/PLASTER 206753
2: DOOR/WINDOW/LOCK 95419
3: PEST/SANITATION 94741
4: SMOKE/CO DETECTOR 40227
5: HEAT/HOT WATER 39990
6: REGISTRATION/ADMIN 35133
7: FLOOR/CEILING 23076
8: PLUMBING 19386
9: OTHER 18061
10: FIRE SAFETY 7120
11: GAS/APPLIANCES 4008
12: ELECTRICAL 1750
13: WATER LEAK 1723
14: BUILDING SYSTEMS 679
15: BUILDING MANAGEMENT 377
16: MOLD 362
17: VENTILATION 119
18: ELEVATOR 81
# Map 311 to standardized categories for comparison
sr311_raw [, Category := fcase(
str_detect(`Complaint Type`, regex("HEAT|HOT WATER", ignore_case = TRUE)), "HEAT/HOT WATER",
str_detect(`Complaint Type`, regex("PLUMBING", ignore_case = TRUE)), "PLUMBING",
str_detect(`Complaint Type`, regex("PAINT|PLASTER", ignore_case = TRUE)), "PAINT/PLASTER",
str_detect(`Complaint Type`, regex("WATER LEAK", ignore_case = TRUE)), "WATER LEAK",
str_detect(`Complaint Type`, regex("ELEVATOR", ignore_case = TRUE)), "ELEVATOR",
str_detect(`Complaint Type`, regex("MOLD", ignore_case = TRUE)), "MOLD",
default = "OTHER"
)]
complaints_counts <- sr311_raw [, .N, by=Category][order(-N)]
cat("\n311 Mapped Categories:\n")
311 Mapped Categories:
print(complaints_counts) Category N
<char> <int>
1: HEAT/HOT WATER 230803
2: PLUMBING 78041
3: PAINT/PLASTER 45028
4: WATER LEAK 30746
5: OTHER 22338
6: ELEVATOR 17890
7: MOLD 402
library(data.table)
library(ggplot2)
library(tidyr)
# Prepare data for comparison - focus on categories that exist in BOTH datasets
comparison_categories <- c("HEAT/HOT WATER", "PLUMBING", "PAINT/PLASTER",
"WATER LEAK", "ELEVATOR", "MOLD")
# Get counts for violations
viol_comparison <- housing_raw[ViolationCategory %in% comparison_categories,
.(Count = .N), by = ViolationCategory]
viol_comparison[, Source := "Housing Violations"]
setnames(viol_comparison, "ViolationCategory", "Category")
# Get counts for 311 complaints
complaints_comparison <- sr311_raw [Category %in% comparison_categories,
.(Count = .N), by = Category]
complaints_comparison[, Source := "311 Complaints"]
# Combine
combined <- rbind(viol_comparison, complaints_comparison)
# Create grouped bar chart
ggplot(combined, aes(x = reorder(Category, -Count), y = Count, fill = Source)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = scales::comma(Count)),
position = position_dodge(width = 0.9),
vjust = -0.5, size = 4) +
scale_fill_manual(values = c("Housing Violations" = "#FF6B6B",
"311 Complaints" = "#4ECDC4")) +
scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0, 0.15))) +
labs(
title = "Violations vs 311 Complaints by Category",
subtitle = "Manhattan, 2022-2025 - Comparable Categories Only",
x = "Category",
y = "Count",
fill = "Source"
) +
theme_minimal(base_size = 16) +
theme(
plot.title = element_text(face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "top"
)